# Fine-tuning launch: runs dist/ft.py through the `deepspeed` launcher.
# Tunable values are collected in the variables below; each one is forwarded
# unchanged to the flag of the same name in the command at the bottom.
# NOTE(review): how each flag is interpreted is decided by the launcher and by
# dist/ft.py, neither of which is shown here; confirm there before editing.

# Launcher settings.
num_gpus=8
ds_config=ds.json

# Inputs and outputs.
model_name_or_path=checkpoint5
train_file=data4
output_dir=model

# Schedule and optimisation values.
num_train_epochs=1
save_steps=3000
gradient_accumulation_steps=1
per_device_train_batch_size=32
learning_rate=1e-6
warmup_steps=2000

# Flag order is kept exactly as before so the resulting argv is unchanged.
deepspeed "--num_gpus=${num_gpus}" dist/ft.py \
  --deepspeed "${ds_config}" \
  --model_name_or_path "${model_name_or_path}" \
  --train_file "${train_file}" \
  --pickle true \
  --do_train \
  --do_eval false \
  --overwrite_cache \
  --fp16 true \
  --output_dir "${output_dir}" \
  --num_train_epochs "${num_train_epochs}" \
  --save_steps "${save_steps}" \
  --gradient_accumulation_steps "${gradient_accumulation_steps}" \
  --per_device_train_batch_size "${per_device_train_batch_size}" \
  --learning_rate "${learning_rate}" \
  --warmup_steps "${warmup_steps}" \
  --overwrite_output_dir